{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `display` and `HTML` are imported from the public IPython.display module;\n",
    "# importing `display` from IPython.core.display is deprecated.\n",
    "from IPython.display import display, HTML\n",
    "\n",
    "# Widen the notebook container to 85% of the browser window.\n",
    "display(HTML(\"<style>.container { width:85% !important; }</style>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from plotnine import *\n",
    "import matplotlib.pyplot as plt\n",
    "# Change into the results directory so that the relative CSV file names used below resolve.\n",
    "# Raw strings keep the Windows backslashes literal instead of relying on unrecognised escapes such as '\\B' or '\\D'.\n",
    "print(os.getcwd())\n",
    "#os.chdir(r'C:\\Users\\Benjamin\\Dropbox\\DH\\Eigene\\DLA\\Ergebnisse\\DNBplusDLA4')\n",
    "os.chdir(r'C:\\Users\\Benjamin\\Dropbox\\DH\\Eigene\\GBV\\Ergebnisse\\gbv2020-14_mit_rvk_neue_Genreanalyse2')\n",
    "print(os.getcwd())\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Part 1: Reading the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Database of German Literature\n",
    "pd.options.display.max_rows = 50\n",
    "df = pd.read_csv('using_data.csv', delimiter=';', lineterminator='\\n')\n",
    "\n",
    "# Non-null entries per column and year, restricted to years up to and including 2020.\n",
    "# The filter is applied before grouping, so the table is only aggregated once.\n",
    "df1 = df[df['date'] <= 2020].groupby('date').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Genre appearances: the CSV is indexed by 'Genrebegriff' and transposed,\n",
    "# so that rows become years (\"date\") and columns become genre terms.\n",
    "import ast\n",
    "pd.options.display.max_rows = 500\n",
    "df_ga = (\n",
    "    pd.read_csv('genre_appearences.csv', delimiter=';', lineterminator='\\n')\n",
    "    .set_index('Genrebegriff')\n",
    "    .T\n",
    "    .rename_axis(\"date\")\n",
    "    .drop(['null'])  # discard the row labelled 'null'\n",
    ")\n",
    "df_ga.index = pd.to_numeric(df_ga.index)\n",
    "\n",
    "# Convert every genre column to numbers; unparsable cells become NaN and are then set to 0.\n",
    "for genre_col in list(df_ga.columns):\n",
    "    df_ga[genre_col] = pd.to_numeric(df_ga[genre_col], errors='coerce')\n",
    "df_ga = df_ga.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Full genre matches: the CSV is indexed by 'Genrebegriff' and transposed,\n",
    "# so that rows become years (\"date\") and columns become genre terms.\n",
    "df_fgm = (\n",
    "    pd.read_csv('full_genre_matches_year.csv', delimiter=';', lineterminator='\\n')\n",
    "    .set_index('Genrebegriff')\n",
    "    .T\n",
    "    .rename_axis(\"date\")\n",
    "    .drop(['null'])  # discard the row labelled 'null'\n",
    ")\n",
    "df_fgm.index = pd.to_numeric(df_fgm.index)\n",
    "\n",
    "# Convert every genre column to numbers; unparsable cells become NaN and are then set to 0.\n",
    "for genre_col in list(df_fgm.columns):\n",
    "    df_fgm[genre_col] = pd.to_numeric(df_fgm[genre_col], errors='coerce')\n",
    "df_fgm = df_fgm.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Full phrases: the CSV is indexed by 'Genrebegriff' and transposed,\n",
    "# so that rows become years (\"date\") and columns become phrases.\n",
    "df_fp = (\n",
    "    pd.read_csv('full_phrases_p_year.csv', delimiter=';', lineterminator='\\n')\n",
    "    .set_index('Genrebegriff')\n",
    "    .T\n",
    "    .rename_axis(\"date\")\n",
    "    .drop(['null'])  # discard the row labelled 'null'\n",
    ")\n",
    "df_fp.index = pd.to_numeric(df_fp.index)\n",
    "\n",
    "# Convert every phrase column to numbers; unparsable cells become NaN and are then set to 0.\n",
    "for phrase_col in list(df_fp.columns):\n",
    "    df_fp[phrase_col] = pd.to_numeric(df_fp[phrase_col], errors='coerce')\n",
    "df_fp = df_fp.fillna(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Part 2: Simple Visualizations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter plot of the per-year 'title' count.\n",
    "p = (\n",
    "    ggplot(df1.reset_index(), aes(x='date', y='title'))\n",
    "    + geom_point(size=1)\n",
    "    + labs(title='Literary production in German Language', x='year', y='books')\n",
    ")\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target folder for all exported figures.\n",
    "path = r\"C:\\Users\\Benjamin\\Dropbox\\Wissenschaft\\Untertitel_und_literarischer_Wandel\\Plots\"\n",
    "# plotnine's save() selects the output type with `format`; `device` is not one of its parameters,\n",
    "# and the PNG export previously requested \"pdf\".\n",
    "p.save(filename=\"plot1.pdf\", width=None, height=None, dpi=1200, format=\"pdf\", path=path)\n",
    "p.save(filename=\"plot1.png\", width=None, height=None, dpi=1200, format=\"png\", path=path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Number of titles with any genre label, copied into df1\n",
    "label_col = \"genre label at all\"\n",
    "share_col = \"books with genre label\"\n",
    "\n",
    "# Remove columns left over from an earlier run of this cell; otherwise the concat below\n",
    "# would add a second column with the same name and the division would fail.\n",
    "df1 = df1.drop(columns=[label_col, share_col], errors=\"ignore\")\n",
    "\n",
    "auswahl = df_ga[[label_col]]\n",
    "df1 = pd.concat([df1, auswahl], axis=1)\n",
    "\n",
    "# Labelled titles as a percentage of all titles of the year (missing values are treated as 0).\n",
    "df1[share_col] = (df1[label_col].divide(df1['title'], fill_value = 0))*100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yearly share of books with a genre label: points plus a red smoothed trend line.\n",
    "p = ggplot(df1.reset_index()) + aes(x='date')\n",
    "p = p + geom_point(aes(y=\"books with genre label\"), size=1)\n",
    "p = p + stat_smooth(aes(y=\"books with genre label\"), se=False, span=0.1, color=\"red\")\n",
    "p = p + labs(title='Books with genre label', x='year', y='proportion of literary production (%)')\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plotnine's save() selects the output type with `format`; `device` is not one of its parameters,\n",
    "# and the PNG export previously requested \"pdf\".\n",
    "p.save(filename=\"plot2.pdf\", width=None, height=None, dpi=1200, format=\"pdf\", path=path)\n",
    "p.save(filename=\"plot2.png\", width=None, height=None, dpi=1200, format=\"png\", path=path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculating relative values (%)\n",
    "This step also drops nonsensical genres and adds up the values for different spellings of the same genre."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yearly title/subtitle counts next to the genre appearances (separate combined frame).\n",
    "df_ga1 = pd.concat([df1[['title', 'subtitle']], df_ga], axis=1)\n",
    "\n",
    "# Add the same counts to df_ga itself; values are aligned on the year index.\n",
    "for count_col, source_col in (('counts_title', 'title'), ('counts_subtitle', 'subtitle')):\n",
    "    df_ga[count_col] = df1[source_col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Merge variant columns into one combined column each; pop() removes the source columns.\n",
    "df_ga[\"carmen heroicum/epos\"] = df_ga.pop(\"epos\") + df_ga.pop(\"carmen heroicum\")\n",
    "\n",
    "# NOTE(review): the pattern is not anchored, so every column whose name contains \"erz\"\n",
    "# anywhere is included in this sum - confirm that this is intended.\n",
    "df_ga[\"erzählung_gesamt\"] = df_ga.filter(regex=(\"erz.*\")).sum(axis=1)\n",
    "\n",
    "df_ga[\"historie/historia\"] = df_ga.pop(\"historie\") + df_ga.pop(\"historia\")\n",
    "df_ga[\"drama/schauspiel\"] = df_ga.pop(\"drama\") + df_ga.pop(\"schauspiel\")\n",
    "\n",
    "# Columns to convert: the combined columns first, then every column located before 'counts_title'.\n",
    "combined_first = [\"drama/schauspiel\", \"historie/historia\", \"erzählung_gesamt\", \"carmen heroicum/epos\"]\n",
    "column_names = list(df_ga.columns)\n",
    "list_ib = combined_first + column_names[:column_names.index(\"counts_title\")]\n",
    "\n",
    "# Percentage of the year's titles for each selected column, stored as '<name>_rel_title'.\n",
    "for genre_name in list_ib:\n",
    "    share = df_ga[genre_name].divide(df_ga['counts_title'], fill_value=0) * 100\n",
    "    df_ga[str(genre_name) + '_' + 'rel_title'] = share"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yearly title/subtitle counts next to the full genre matches (separate combined frame).\n",
    "df_fgm1 = pd.concat([df1[['title', 'subtitle']], df_fgm], axis=1)\n",
    "\n",
    "# Add the same counts to df_fgm itself; values are aligned on the year index.\n",
    "for count_col, source_col in (('counts_title', 'title'), ('counts_subtitle', 'subtitle')):\n",
    "    df_fgm[count_col] = df1[source_col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "### Merge variant columns: combined name -> the two source columns (removed after summing)\n",
    "variant_merges = {\n",
    "    \"gesamt_originalroman\": (\"origroman\", \"originalroman\"),\n",
    "    \"wild-west-roman\": (\"wildwestroman\", \"wild-westroman\"),\n",
    "    \"characterbild/charakterbild\": (\"characterbild\", \"charakterbild\"),\n",
    "    \"autobiografie/autobiographie\": (\"autobiografie\", \"autobiographie\"),\n",
    "}\n",
    "for combined_name, (first_variant, second_variant) in variant_merges.items():\n",
    "    df_fgm[combined_name] = df_fgm.pop(first_variant) + df_fgm.pop(second_variant)\n",
    "\n",
    "### Delete erroneous matches\n",
    "# 'biographie' / 'biografie' stay in the table; their deletion (as macro genre) is disabled:\n",
    "#del df_fgm['biographie']\n",
    "#del df_fgm['biografie']\n",
    "erroneous_matches = [\n",
    "    'perroman', 'historischroman', 'historischerroman', 'geschichtroman', 'frühroman',\n",
    "    'abbild', 'abgebild', 'arbeitsbuch',\n",
    "    'auschwitz',\n",
    "    'czernowitz',\n",
    "    'ehren-mitglied',\n",
    "    'gebild',\n",
    "    'glied',\n",
    "    'handbuch',\n",
    "    'handschrift',\n",
    "    'jahrbuch',\n",
    "    'menschenbild',\n",
    "    'mitglied',\n",
    "    'preckelwitz',\n",
    "    'thumkirchen',\n",
    "    'titelbild',\n",
    "    'weltbild',\n",
    "    'wirkungsgeschichte',\n",
    "    'wörterbuch',\n",
    "    \"festschrift\",\n",
    "    \"literaturgeschichte\",\n",
    "    \"textbuch\",\n",
    "    \"nibelungenlied\",\n",
    "    \"spiegelschrift\",\n",
    "    \"traurgedicht\",\n",
    "    \"mira-valenskykrimi\",\n",
    "    \"stammbuch\",\n",
    "    \"kulturgeschichte\",\n",
    "]\n",
    "df_fgm.drop(columns=erroneous_matches, inplace=True)\n",
    "\n",
    "# Columns to convert: the combined columns first (last merged first),\n",
    "# then every column located before 'counts_title'.\n",
    "column_names = list(df_fgm.columns)\n",
    "list_ib = list(variant_merges)[::-1] + column_names[:column_names.index(\"counts_title\")]\n",
    "\n",
    "# Percentage of the year's titles for each selected column, stored as '<name>_rel_title'.\n",
    "for genre_name in list_ib:\n",
    "    share = df_fgm[genre_name].divide(df_fgm['counts_title'], fill_value=0) * 100\n",
    "    df_fgm[str(genre_name) + '_' + 'rel_title'] = share\n",
    "\n",
    "print(len(df_fgm))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yearly title/subtitle counts next to the full phrases (separate combined frame).\n",
    "df_fp1 = pd.concat([df1[['title', 'subtitle']], df_fp], axis=1)\n",
    "\n",
    "# Add the same counts to df_fp itself; values are aligned on the year index.\n",
    "for count_col, source_col in (('counts_title', 'title'), ('counts_subtitle', 'subtitle')):\n",
    "    df_fp[count_col] = df1[source_col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "### Merge spelling variants: combined name -> the two source columns (removed after summing)\n",
    "variant_merges = {\n",
    "    \"socialer/sozialer roman\": (\"sozialer roman\", \"socialer roman\"),\n",
    "    \"comischer/komischer roman\": (\"komischer roman\", \"comischer roman\"),\n",
    "    \"biographischer/biografischer roman\": (\"biographischer roman\", \"biografischer roman\"),\n",
    "    \"autobiographischer/autobiografischer roman\": (\"autobiographischer roman\", \"autobiografischer roman\"),\n",
    "    \"wahre geschichte/wahre geschichten\": (\"wahre geschichte\", \"wahre geschichten\"),\n",
    "    \"geistliche(r) lieder\": (\"geistliche lieder\", \"geistlicher lieder\"),\n",
    "}\n",
    "for combined_name, (first_variant, second_variant) in variant_merges.items():\n",
    "    df_fp[combined_name] = df_fp.pop(first_variant) + df_fp.pop(second_variant)\n",
    "\n",
    "# Keep only columns whose name contains none of the listed adjectives and\n",
    "# no single letter followed by whitespace and further letters.\n",
    "df_fp = df_fp.filter(regex=r'^((?!andere|ausgewählte|neue|schönste|vermischte|frühe|deutsche|nachgelassene|weitere|gesammelte|beste|\\b[a-z]{1}\\s[a-z]+).)*$')\n",
    "\n",
    "# Phrases removed explicitly.\n",
    "unwanted_phrases = [\n",
    "    'folgender ode',\n",
    "    'große buch',\n",
    "    'heiligen schrift',\n",
    "    'kleine buch',\n",
    "    'kleines buch',\n",
    "    'lebende bilder',\n",
    "    'lebenden bildern',\n",
    "    'nachstehender ode',\n",
    "    'vollständiges buch',\n",
    "    'satiren',\n",
    "    \"novellen\",\n",
    "    \"kleine geschichten\",\n",
    "    \"gedichte\",\n",
    "    \"geschichten\",\n",
    "    \"achleitner roman\",\n",
    "    \"erzaehlungen\",\n",
    "    \"gedichte erzählungen\",\n",
    "    \"gedichte geschichten\",\n",
    "]\n",
    "df_fp = df_fp.drop(columns=unwanted_phrases)\n",
    "\n",
    "# Columns to convert: the combined columns first, then every column located before 'counts_title'.\n",
    "combined_first = [\n",
    "    \"geistliche(r) lieder\",\n",
    "    \"autobiographischer/autobiografischer roman\",\n",
    "    \"wahre geschichte/wahre geschichten\",\n",
    "    \"biographischer/biografischer roman\",\n",
    "    \"socialer/sozialer roman\",\n",
    "    \"comischer/komischer roman\",\n",
    "]\n",
    "column_names = list(df_fp.columns)\n",
    "list_ib = combined_first + column_names[:column_names.index(\"counts_title\")]\n",
    "\n",
    "# Percentage of the year's titles for each selected column, stored as '<name>_rel_title'.\n",
    "for phrase_name in list_ib:\n",
    "    share = df_fp[phrase_name].divide(df_fp['counts_title'], fill_value=0) * 100\n",
    "    df_fp[str(phrase_name) + '_' + 'rel_title'] = share\n",
    "print(len(df_fp))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualizing the 10 most common \"adj + roman\" and \"noun + roman\"-terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage columns of phrases matching \"...r roman\" (a word ending in -r followed by \"roman\").\n",
    "df_adj_roman = df_fp.filter(regex=r'r\\sroman_rel_title', axis=1)\n",
    "df_adj_roman = df_adj_roman.drop(2020) # remove 2020\n",
    "print(\"Spalten: \", len(df_adj_roman.columns))\n",
    "\n",
    "# Ten phrases with the largest share summed over all years.\n",
    "new_df = df_adj_roman.sum(axis=0).sort_values(ascending=False).head(10)\n",
    "print(new_df)\n",
    "index2 = new_df.index.tolist() # column names of the selected phrases\n",
    "#index2 = index2[1:] # historischer Roman excluded\n",
    "df_adj_roman2 = df_adj_roman[index2]\n",
    "\n",
    "# Strip the \"_rel_title\" suffix. A raw string is used because \"\\_\" is not a valid\n",
    "# Python string escape; the underscore needs no escaping in the pattern.\n",
    "df_adj_roman2 = df_adj_roman2.rename(columns=lambda x: re.sub(r\"_rel_title\", \"\", x))\n",
    "\n",
    "# from wide to long format\n",
    "df_adj_roman2 = pd.melt(df_adj_roman2.reset_index(),id_vars=['date'],var_name='genre name', value_name='values')\n",
    "pd.set_option('display.max_rows', 500)\n",
    "print(df_adj_roman2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# only the historical terms\n",
    "#df_fp.filter(regex=r'hist.*', axis=1).sum().sort_values(ascending=False).head(50)\n",
    "\n",
    "# \"roman\" percentage columns ordered by their sum over all years (largest first)\n",
    "df_fp.filter(regex=r'roman_rel_title', axis=1).sum().sort_values(ascending=False).head(500)\n",
    "\n",
    "# ordered by maximum instead\n",
    "#df_fp.filter(regex=r'roman_rel_title', axis=1).max().sort_values(ascending=False).head(50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# One panel per phrase (two columns, independent y axes), showing the yearly share as a line.\n",
    "p = (\n",
    "    ggplot(df_adj_roman2.reset_index(), aes(x='date', y='values'))\n",
    "    + geom_line()\n",
    "    + labs(title='books with subtitle adj. + \"Roman\" (novel)', x='year', y='proportion of literary production (%)')\n",
    "    + xlim(1770, 2019)\n",
    "    + facet_wrap(\"genre name\", ncol=2, scales=\"free_y\", as_table=True)\n",
    "    + theme(subplots_adjust={'wspace': 0.15})\n",
    ")\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plotnine's save() selects the output type with `format`; `device` is not one of its parameters,\n",
    "# and the PNG export previously requested \"pdf\".\n",
    "p.save(filename=\"plot7.pdf\", width=None, height=None, dpi=1200, format=\"pdf\", path=path)\n",
    "p.save(filename=\"plot7.png\", width=None, height=None, dpi=1200, format=\"png\", path=path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Subset noun + roman"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage columns of all compound genre terms containing \"roman\".\n",
    "df_noun_roman = df_fgm.filter(regex=r'roman_rel_title', axis=1)\n",
    "df_noun_roman = df_noun_roman.drop(2020) # remove 2020\n",
    "print(\"Spalten: \", len(df_noun_roman.columns))\n",
    "\n",
    "# The 15 terms with the largest share summed over all years.\n",
    "new_df = df_noun_roman.sum(axis=0).sort_values(ascending=False).head(15)\n",
    "print(\"Summe der Aneile über gesamte Zeit:\", new_df)\n",
    "\n",
    "# Alternative ranking by maximum:\n",
    "#new_df = df_noun_roman.max().sort_values(ascending=False).head(10)\n",
    "#print(\"Maxima über gesamte Zeit:\", new_df)\n",
    "\n",
    "index2 = new_df.index.tolist() # column names of the selected terms\n",
    "df_noun_roman2 = df_noun_roman[index2]\n",
    "\n",
    "# Strip the \"_rel_title\" suffix. A raw string is used because \"\\_\" is not a valid\n",
    "# Python string escape; the underscore needs no escaping in the pattern.\n",
    "df_noun_roman2 = df_noun_roman2.rename(columns=lambda x: re.sub(r\"_rel_title\", \"\", x))\n",
    "\n",
    "# from wide to long format\n",
    "df_noun_roman2 = pd.melt(df_noun_roman2.reset_index(),id_vars=['date'],var_name='genre name', value_name='values')\n",
    "pd.set_option('display.max_rows', 500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One panel per term (two columns, independent y axes), showing the yearly share as a line.\n",
    "p = (\n",
    "    ggplot(df_noun_roman2.reset_index())\n",
    "    + aes(x='date', y='values')\n",
    "    + geom_line()\n",
    "    + labs(title='books with subtitle noun + \"roman\" (novel)', x='year', y='proportion of literary production (%)')\n",
    "    + xlim(1770, 2019)\n",
    "    + facet_wrap(\"genre name\", ncol=2, scales=\"free_y\", as_table=True)\n",
    "    + theme(subplots_adjust={'wspace': 0.15})\n",
    ")\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plotnine's save() selects the output type with `format`; `device` is not one of its parameters,\n",
    "# and the PNG export previously requested \"pdf\".\n",
    "p.save(filename=\"plot6.pdf\", width=None, height=None, dpi=1200, format=\"pdf\", path=path)\n",
    "p.save(filename=\"plot6.png\", width=None, height=None, dpi=1200, format=\"png\", path=path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate institutional cycles of all compound genres and full-phrases-genres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "ROLLING_WINDOW = 2     # number of years averaged before runs are detected\n",
    "MIN_CYCLE_YEARS = 10   # shortest non-zero run that is recorded as a cycle\n",
    "\n",
    "df_fgm_fp = pd.concat([df_fgm, df_fp], axis=1)\n",
    "df_fgm_fp.replace(np.nan, 0, inplace = True)\n",
    "\n",
    "# keep only the percentage columns and remove 2020\n",
    "df_fgm_fp = df_fgm_fp.filter(regex=r'rel_title', axis=1)\n",
    "df_fgm_fp = df_fgm_fp.drop(2020)\n",
    "list_ib = df_fgm_fp.columns.values\n",
    "\n",
    "# Run positions are turned into years by adding the first year of the index\n",
    "# (formerly the literal 1500). This assumes one row per consecutive year.\n",
    "first_year = int(df_fgm_fp.index[0])\n",
    "\n",
    "# Per-year counters (all starting at 0) and the table of cycle markers.\n",
    "df_fgm_fp_maxwerte = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_maxwerte[\"counts per max\"] = 0\n",
    "df_fgm_fp_local_maxwerte = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_local_maxwerte[\"counts per max\"] = 0\n",
    "df_fgm_fp_cyclestart = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_cyclestart[\"counts per max\"] = 0\n",
    "df_fgm_fp_cyclen = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_cyclestart_first = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_cyclestart_first[\"counts per max\"] = 0\n",
    "\n",
    "def find_nonzeros(a):\n",
    "    \"\"\"Return an (n, 2) array of [start, end) positions of the non-zero runs in `a`.\"\"\"\n",
    "    # Create an array that is 1 where a is nonzero, and pad each end with an extra 0.\n",
    "    isnonzero = np.concatenate(([0], (np.asarray(a) != 0).view(np.int8), [0]))\n",
    "    absdiff = np.abs(np.diff(isnonzero))\n",
    "    # Runs start and end where absdiff is 1.\n",
    "    ranges = np.where(absdiff == 1)[0].reshape(-1, 2)\n",
    "    return ranges\n",
    "\n",
    "counter = 0\n",
    "persistant_genres = []\n",
    "for genre in list_ib:\n",
    "    # Smoothed yearly percentages. A dedicated name is used so that the raw table `df`\n",
    "    # loaded at the top of the notebook is not overwritten by this loop.\n",
    "    smoothed = df_fgm_fp[[genre]].rolling(window=ROLLING_WINDOW, min_periods=1).mean().round(5)\n",
    "    if smoothed.shape[1] != 1:\n",
    "        # The column name occurs more than once, so the selection is ambiguous.\n",
    "        print(\"fehlerhaftes genre: \", genre)\n",
    "        continue\n",
    "\n",
    "    # stored with the cycle markers, used for sorting by maximum\n",
    "    maxwert = df_fgm_fp[genre].max()\n",
    "\n",
    "    # cycle detection on the smoothed series\n",
    "    data = smoothed.iloc[:, 0].values\n",
    "    cycles = find_nonzeros(data)\n",
    "\n",
    "    longest_len = 0\n",
    "    longest_pos = None  # stays None when the series has no non-zero run at all\n",
    "    foundfirstcycle = False\n",
    "\n",
    "    ### mark every cycle of at least MIN_CYCLE_YEARS years with 1\n",
    "    for pos, (start, end) in enumerate(cycles):\n",
    "        length = end - start\n",
    "        if length > longest_len:\n",
    "            longest_len = length\n",
    "            longest_pos = pos\n",
    "        if length < MIN_CYCLE_YEARS:\n",
    "            continue\n",
    "        if not foundfirstcycle:\n",
    "            persistant_genres.append(genre)\n",
    "            cyclestart = start + first_year\n",
    "            df_fgm_fp_cyclestart_first.at[cyclestart, \"counts per max\"] += 1\n",
    "            print(\"first cycle: \", cycles[pos] + first_year, genre)\n",
    "            print(\"cycle start: \", cyclestart)\n",
    "            counter += 1\n",
    "            foundfirstcycle = True\n",
    "        for jahr in range(start, end):\n",
    "            df_fgm_fp_cyclen.at[jahr + first_year, genre] = 1\n",
    "        df_fgm_fp_cyclen.at[\"maxwert\", genre] = maxwert\n",
    "\n",
    "    ### count the start year of the longest cycle, if it is long enough\n",
    "    if longest_pos is not None and longest_len >= MIN_CYCLE_YEARS:\n",
    "        bestcycle = cycles[longest_pos] + first_year\n",
    "        cyclestart = bestcycle[0]\n",
    "        df_fgm_fp_cyclestart.at[cyclestart, \"counts per max\"] += 1\n",
    "        print(\"Longest cycle:\", bestcycle, genre)\n",
    "\n",
    "print(\"Genres taken into account: \", counter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reorder the genre columns by descending column sum.\n",
    "column_order = df_fgm_fp_cyclen.sum().sort_values(ascending=False).index\n",
    "df_fgm_fp_cyclen = df_fgm_fp_cyclen.reindex(column_order, axis=1)\n",
    "\n",
    "list_genres = df_fgm_fp_cyclen.columns.values\n",
    "# Replace the cycle marker 1 by the column's position in that order.\n",
    "# No blanket try/except here: an error in this step should be visible, not swallowed.\n",
    "for i, genre in enumerate(list_genres):\n",
    "    print(i, genre)\n",
    "    df_fgm_fp_cyclen.loc[df_fgm_fp_cyclen[genre] == 1, genre] = i\n",
    "print(df_fgm_fp_cyclen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the helper row \"maxwert\" if it exists (a missing row is not an error).\n",
    "df_fgm_fp_cyclen = df_fgm_fp_cyclen.drop(\"maxwert\", errors=\"ignore\")\n",
    "\n",
    "# Strip the \"_rel_title\" suffix. A raw string is used because \"\\_\" is not a valid\n",
    "# Python string escape; the underscore needs no escaping in the pattern.\n",
    "df_fgm_fp_cyclen = df_fgm_fp_cyclen.rename(columns=lambda x: re.sub(r\"_rel_title\", \"\", x))\n",
    "\n",
    "# from wide to long format (the result keeps the name \"..._wide\" under which it is used below)\n",
    "df_fgm_fp_cyclen_wide = pd.melt(df_fgm_fp_cyclen.reset_index(),id_vars=['date'],var_name='genre name', value_name='values')\n",
    "pd.set_option('display.max_rows', 1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw the first 20800 rows of the long table, one coloured line per genre name.\n",
    "cycle_rows = df_fgm_fp_cyclen_wide.iloc[0:20800, :]\n",
    "p = (\n",
    "    ggplot(cycle_rows, aes(x=\"date\", y=\"values\", colour=\"genre name\", group=\"genre name\"))\n",
    "    + geom_line()\n",
    "    + xlab(\"year\")\n",
    "    + ylab(\"\")\n",
    "    + xlim(1600, 2020)\n",
    ")\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the long-format cycle table as a semicolon-separated CSV for a nicer graph in R (using \"Iwanthue\").\n",
    "export_file = r'C:\\Users\\Benjamin\\Dropbox\\Wissenschaft\\Fiktion und Genre\\Habil\\R\\df_fgm_fp_cyclen_wide.csv'\n",
    "df_fgm_fp_cyclen_wide.to_csv(export_file, sep=\";\", index=False, header=True, encoding=\"UTF8\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate and visualize the beginnings of the first institutional cycle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "ROLLING_WINDOW = 3     # number of years averaged before runs are detected\n",
    "MIN_CYCLE_YEARS = 10   # shortest non-zero run that is recorded as a cycle\n",
    "\n",
    "df_fgm_fp = pd.concat([df_fgm, df_fp], axis=1)\n",
    "df_fgm_fp.replace(np.nan, 0, inplace = True)\n",
    "\n",
    "# keep only the percentage columns and remove 2020\n",
    "df_fgm_fp = df_fgm_fp.filter(regex=r'rel_title', axis=1)\n",
    "df_fgm_fp = df_fgm_fp.drop(2020)\n",
    "list_ib = df_fgm_fp.columns.values\n",
    "\n",
    "# Run positions are turned into years by adding the first year of the index\n",
    "# (formerly the literal 1500). This assumes one row per consecutive year.\n",
    "first_year = int(df_fgm_fp.index[0])\n",
    "\n",
    "# Per-year counters (all starting at 0) and the table of cycle markers.\n",
    "df_fgm_fp_maxwerte = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_maxwerte[\"counts per max\"] = 0\n",
    "df_fgm_fp_local_maxwerte = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_local_maxwerte[\"counts per max\"] = 0\n",
    "df_fgm_fp_cyclestart = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_cyclestart[\"counts per max\"] = 0\n",
    "df_fgm_fp_cyclen = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_cyclestart_first = pd.DataFrame(index=df_fgm_fp.index)\n",
    "df_fgm_fp_cyclestart_first[\"counts per max\"] = 0\n",
    "\n",
    "def find_nonzeros(a):\n",
    "    \"\"\"Return an (n, 2) array of [start, end) positions of the non-zero runs in `a`.\"\"\"\n",
    "    # Create an array that is 1 where a is nonzero, and pad each end with an extra 0.\n",
    "    isnonzero = np.concatenate(([0], (np.asarray(a) != 0).view(np.int8), [0]))\n",
    "    absdiff = np.abs(np.diff(isnonzero))\n",
    "    # Runs start and end where absdiff is 1.\n",
    "    ranges = np.where(absdiff == 1)[0].reshape(-1, 2)\n",
    "    return ranges\n",
    "\n",
    "counter = 0\n",
    "persistant_genres = []\n",
    "for genre in list_ib:\n",
    "    # Smoothed yearly percentages. A dedicated name is used so that the raw table `df`\n",
    "    # loaded at the top of the notebook is not overwritten by this loop.\n",
    "    smoothed = df_fgm_fp[[genre]].rolling(window=ROLLING_WINDOW, min_periods=1).mean().round(5)\n",
    "    if smoothed.shape[1] != 1:\n",
    "        # The column name occurs more than once, so the selection is ambiguous.\n",
    "        print(\"fehlerhaftes genre: \", genre)\n",
    "        continue\n",
    "\n",
    "    # stored with the cycle markers, used for sorting by maximum\n",
    "    maxwert = df_fgm_fp[genre].max()\n",
    "\n",
    "    # cycle detection on the smoothed series\n",
    "    data = smoothed.iloc[:, 0].values\n",
    "    cycles = find_nonzeros(data)\n",
    "\n",
    "    longest_len = 0\n",
    "    longest_pos = None  # stays None when the series has no non-zero run at all\n",
    "    foundfirstcycle = False\n",
    "\n",
    "    ### mark every cycle of at least MIN_CYCLE_YEARS years with 1\n",
    "    for pos, (start, end) in enumerate(cycles):\n",
    "        length = end - start\n",
    "        if length > longest_len:\n",
    "            longest_len = length\n",
    "            longest_pos = pos\n",
    "        if length < MIN_CYCLE_YEARS:\n",
    "            continue\n",
    "        if not foundfirstcycle:\n",
    "            persistant_genres.append(genre)\n",
    "            cyclestart = start + first_year\n",
    "            df_fgm_fp_cyclestart_first.at[cyclestart, \"counts per max\"] += 1\n",
    "            print(\"first cycle: \", cycles[pos] + first_year, genre)\n",
    "            print(\"cycle start: \", cyclestart)\n",
    "            counter += 1\n",
    "            foundfirstcycle = True\n",
    "        for jahr in range(start, end):\n",
    "            df_fgm_fp_cyclen.at[jahr + first_year, genre] = 1\n",
    "        df_fgm_fp_cyclen.at[\"maxwert\", genre] = maxwert\n",
    "\n",
    "    ### count the start year of the longest cycle, if it is long enough\n",
    "    if longest_pos is not None and longest_len >= MIN_CYCLE_YEARS:\n",
    "        bestcycle = cycles[longest_pos] + first_year\n",
    "        cyclestart = bestcycle[0]\n",
    "        df_fgm_fp_cyclestart.at[cyclestart, \"counts per max\"] += 1\n",
    "        print(\"Longest cycle:\", bestcycle, genre)\n",
    "\n",
    "print(\"Genres taken into account: \", counter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of first-cycle beginnings per year: black points plus a red smoothed trend line.\n",
    "p = ggplot(df_fgm_fp_cyclestart_first.reset_index()) + aes(x=\"date\")\n",
    "p = p + geom_point(aes(y='counts per max'), color='black', size=0.5)\n",
    "p = p + stat_smooth(aes(y='counts per max'), se=False, span=0.1, color=\"red\")\n",
    "p = p + labs(title='Beginnings of first institutional cycle per genre', x='year', y='beginnings')\n",
    "p = p + xlim(1500, 2019)\n",
    "p = p + ylim(0, 15)\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plotnine's save() selects the output type with `format`; `device` is not one of its parameters,\n",
    "# and the PNG export previously requested \"pdf\".\n",
    "p.save(filename=\"plot9.pdf\", width=None, height=None, dpi=1200, format=\"pdf\", path=path)\n",
    "p.save(filename=\"plot9.png\", width=None, height=None, dpi=1200, format=\"png\", path=path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualize truth-signaling genre labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Wahre Geschichten ###\n",
    "df_fgm_fp = pd.concat([df_fgm, df_fp], axis=1)\n",
    "index2 = []\n",
    "df_wahre = pd.DataFrame()\n",
    "\n",
    "# detailliert\n",
    "\"\"\"df_wahre[\"wahre + X [true X]\"] = df_fgm_fp.filter(regex=r'\\bwahre.+_rel_title|\\bware.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"wahrhaftige + X [truthful X]\"] = df_fgm_fp.filter(regex=r'\\bwahrh.+_rel_title|\\bwarh.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"Wahrheitsgetreue / getreu + X\"] = df_fgm_fp.filter(regex=r'getreu.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"wahrscheinliche + X [probable X]\"] = df_fgm_fp.filter(regex=r'\\bwahrscheinlich.+_rel_title|\\bwarscheinlich.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"Tatsachen + X [fact X]\"] = df_fgm_fp.filter(regex=r'\\btatsachen.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"Dokumentar + X / dokumentarische X [documentary X]\"] = df_fgm_fp.filter(regex=r'\\bdokumentar.+_rel_title|\\bdocumentar.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"Reportage + X [reportage + X]\"] = df_fgm_fp.filter(regex=r'\\breportage.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"Sach + X (except Sachbuch) [factual X]\"] = df_fgm_fp.filter(regex=r'(?!sachbuch)\\bsach.+_rel_title', axis=1).sum(axis=1) # negative lookahead\n",
    "\"\"\"\n",
    "#print(df_wahre4.sum(axis=0).head(20))\n",
    "#df_wahre[\"erfundene / fiktive + X\"] = df_fgm_fp.filter(regex=r'\\bfiktive.+_rel_title|\\berfundene.+_rel_title', axis=1).sum(axis=1)\n",
    "#df_wahre[\"true stories aggregated\"] = df_wahre.sum(axis = 1)\n",
    "\n",
    "# aufsummierte Kategorien\n",
    "df_wahre[\"wahre + X (true X)\"] = df_fgm_fp.filter(regex=r'\\bwahre.+_rel_title|\\bware.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"wahrhaftige + X (truthful X)\"] = df_fgm_fp.filter(regex=r'\\bwahrh.+_rel_title|\\bwarh.+_rel_title', axis=1).sum(axis=1)\n",
    "#df_wahre[\"Wahrheitsgetreue / getreu + X\"] = df_fgm_fp.filter(regex=r'getreu.+_rel_title', axis=1).sum(axis=1)\n",
    "#df_wahre[\"wahrscheinliche + X [probable X]\"] = df_fgm_fp.filter(regex=r'\\bwahrscheinlich.+_rel_title|\\bwarscheinlich.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"Dokumentar/ dokumentarische(r) + X \\nReportage + X \\nSach + X (except >Sachbuch<)\\nTatsachen + X\"] = df_fgm_fp.filter(regex=r'\\btatsachen.+_rel_title|\\bdokumentar.+_rel_title|\\bdocumentar.+_rel_title|\\breportage.+_rel_title|(?!sachbuch)\\bsach.+_rel_title', axis=1).sum(axis=1)\n",
    "#df_wahre[\"factual genres\"] = df_ga.filter(regex=r'\\baufzeichnung.*_rel_title|\\bbericht.*_rel_title|\\bbiografie.*_rel_title|\\bdokumentation.*_rel_title|\\bessay.*_rel_title|\\breportage.*_rel_title', axis=1).sum(axis=1)\n",
    "#df_wahre[\"Sachbuch\"] = df_fgm_fp.filter(regex=r'\\bsachbuch', axis=1).sum(axis=1)\n",
    "\n",
    "#print(df_wahre.sum(axis=0).sort_values(ascending=False).head(20))\n",
    "\n",
    "# from wide to long format\n",
    "df_wahre_long = pd.melt(df_wahre.reset_index(),id_vars=['date'],var_name='genre label', value_name='values')\n",
    "pd.set_option('display.max_rows', 500)\n",
    "df_wahre_long.replace(0, np.nan, inplace = True)\n",
    "#print(df_wahre_long)\n",
    "\n",
    "p = (ggplot(df_wahre_long.reset_index())\n",
    " + aes(x='date', y='values', colour = \"genre label\" )\n",
    " + geom_point(size = 1, alpha=0.6)\n",
    "#+ stat_smooth(aes(y=\"values\"), span=0.13)\n",
    " + labs(title='Truth-signaling labels', x='year', y='proportion of literary production (%)')\n",
    " + xlim(1500, 2019)\n",
    "    + ylim(0,6)\n",
    "             )\n",
    "p = p + theme(legend_title = element_blank())\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p.save(filename = \"plot12.pdf\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)\n",
    "p.save(filename = \"plot12.png\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualize history related genre labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Geschichtsbezogene Geschichten ###\n",
    "df_wahre2 = pd.DataFrame()\n",
    "df_wahre2 = pd.DataFrame(index=df_wahre.index)\n",
    "\n",
    "# detailliert\n",
    "df_wahre2[\"geschichtlich + X / \\nGeschicht(s) + X (except >Geschichte<) \"] = df_fgm_fp.filter(regex=r'geschichtlich.+_rel_title|(?!geschichte)geschicht.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre2[\"historisch + X\"] = df_fgm_fp.filter(regex=r'historisch.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre2[\"historie / historia\"] = df_ga.filter(regex=r'historie.+_rel_title|historia.+_rel_title|histori\\b.+_rel_title', axis=1).sum(axis=1)\n",
    "#df_wahre2[\"chronik\"] = df_ga.filter(regex=r'chronik.+_rel_title', axis=1).sum(axis=1)\n",
    "\n",
    "\n",
    "\"\"\"# aufsummierte Kategorien\n",
    "df_wahre[\"wahre + X [true X]\"] = df_fgm_fp.filter(regex=r'\\bwahre.+_rel_title|\\bware.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"wahrhaftige + X [truthful X]\"] = df_fgm_fp.filter(regex=r'\\bwahrh.+_rel_title|\\bwarh.+_rel_title', axis=1).sum(axis=1)\n",
    "#df_wahre[\"Wahrheitsgetreue / getreu + X\"] = df_fgm_fp.filter(regex=r'getreu.+_rel_title', axis=1).sum(axis=1)\n",
    "#df_wahre[\"wahrscheinliche + X [probable X]\"] = df_fgm_fp.filter(regex=r'\\bwahrscheinlich.+_rel_title|\\bwarscheinlich.+_rel_title', axis=1).sum(axis=1)\n",
    "df_wahre[\"Dokumentar/ dokumentarische(r) + X \\nReportage + X \\nSach + X (except >Sachbuch<)\\nTatsachen + X\"] = df_fgm_fp.filter(regex=r'\\btatsachen.+_rel_title|\\bdokumentar.+_rel_title|\\bdocumentar.+_rel_title|\\breportage.+_rel_title|(?!sachbuch)\\bsach.+_rel_title', axis=1).sum(axis=1)\n",
    "\"\"\"\n",
    "#print(df_wahre2.sum(axis=0).sort_values(ascending=False).head(20))\n",
    "\n",
    "# wahre geschichten und hist. genres zusammen\n",
    "#print(\"df_wahre: \\n\", df_wahre)\n",
    "#print(\"df_wahre2: \\n\", df_wahre2)\n",
    "#df_wahre2 = pd.concat([df_wahre2,df_wahre], axis=1)\n",
    "#print(\"df_wahre2 + df_wahre: \\n\", df_wahre2)\n",
    "\n",
    "# from wide to long format\n",
    "df_wahre2_long = pd.melt(df_wahre2.reset_index(),id_vars=['date'],var_name='genre label', value_name='values')\n",
    "pd.set_option('display.max_rows', 500)\n",
    "df_wahre2_long.replace(0, np.nan, inplace = True)\n",
    "#print(df_wahre2)\n",
    "\n",
    "p = (ggplot(df_wahre2_long.reset_index())\n",
    " + aes(x='date', y='values', colour = \"genre label\" )\n",
    " + geom_point(size = 1, alpha=0.6)\n",
    "#+ stat_smooth(aes(y=\"values\"), span=0.13)\n",
    " + labs(title='History-related labels', x='year', y='proportion of literary production (%)')\n",
    " + xlim(1500, 2019)\n",
    "     + ylim(0,16)\n",
    "             )\n",
    "p = p + theme(legend_title = element_blank())\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p.save(filename = \"plot11.pdf\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)\n",
    "p.save(filename = \"plot11.png\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualize truth-signaling and history-related genres in one plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# wahre geschichten und hist. genres zusammen\n",
    "df_wahre3 = pd.DataFrame()\n",
    "df_wahre3[\"history-related labels\"] = df_wahre2.sum(axis=1)\n",
    "df_wahre3[\"truth-signaling labels\"] = df_wahre.sum(axis=1)\n",
    "#df_wahre3 = pd.concat([df_wahre2,df_wahre], axis=1)\n",
    "\n",
    "# from wide to long format\n",
    "df_wahre3_long = pd.melt(df_wahre3.reset_index(),id_vars=['date'],var_name='genre label', value_name='values')\n",
    "pd.set_option('display.max_rows', 500)\n",
    "df_wahre3_long.replace(0, np.nan, inplace = True)\n",
    "#print(df_wahre2)\n",
    "\n",
    "p = (ggplot(df_wahre3_long.reset_index())\n",
    " + aes(x='date', y='values', colour = \"genre label\" )\n",
    " #+ geom_point(size = 1, alpha=0.8)\n",
    "+ stat_smooth(aes(y=\"values\"), span=0.15)\n",
    " + labs(title='The parting of the ways: history-related and truth-signaling labels', x='year', y='proportion of literary production (%)')\n",
    " + xlim(1500, 2019)\n",
    "     + ylim(0,6)\n",
    "             )\n",
    "p = p + theme(legend_title = element_blank())\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p.save(filename = \"plot13.pdf\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)\n",
    "p.save(filename = \"plot13.png\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Most common major genre labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ga_auswahl = df_ga[['roman_rel_title', 'erzählung_gesamt_rel_title', 'geschichte_rel_title', 'novelle_rel_title', 'lied_rel_title', \\\n",
    "                                    'gedicht_rel_title', 'historie/historia_rel_title', 'carmen heroicum/epos_rel_title', \"brief_rel_title\", \\\n",
    "                       \"bericht_rel_title\", \"drama/schauspiel_rel_title\"]] \n",
    "#print(df_ga_auswahl)\n",
    "df_ga_auswahlwide = pd.melt(df_ga_auswahl.reset_index(),id_vars=['date'],var_name='genre_name', value_name='values')\n",
    "pd.set_option('display.max_rows', 500)\n",
    "#print(df_ga_auswahlwide)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from palettable.colorbrewer.qualitative import Set1_8\n",
    "df_ga_auswahl = df_ga[['roman_rel_title', 'erzählung_gesamt_rel_title', 'geschichte_rel_title', 'novelle_rel_title', 'lied_rel_title', \\\n",
    "                                    'gedicht_rel_title', 'historie/historia_rel_title', 'carmen heroicum/epos_rel_title', \"brief_rel_title\", \\\n",
    "                       \"bericht_rel_title\", \"drama/schauspiel_rel_title\"]] \n",
    "df_ga_auswahl = df_ga_auswahl.rename(columns=lambda x: re.sub(\"\\_rel\\_title\", \"\", x))\n",
    "df_ga_auswahl_roll = df_ga_auswahl.rolling(window = 5, min_periods=2).mean()\n",
    "df_ga_auswahl_roll_wide = pd.melt(df_ga_auswahl_roll.reset_index(),id_vars=['date'],var_name='genre_name', value_name='values')\n",
    "pd.set_option('display.max_rows', 500)\n",
    "#print(df_ga_auswahl_roll_wide)\n",
    "p = (ggplot(df_ga_auswahl_roll_wide.reset_index(), aes(x=\"date\", y= \"values\", fill=\"genre_name\")) +\n",
    "  geom_area()\n",
    "     + labs(title='Presence of major genre labels, 1500-1800', x='year', y='proportion of literary production (%)')\n",
    "      + xlim(1500, 1800)\n",
    "   # + scale_fill_brewer(type = \"qual\", palette = \"Paired\")\n",
    " # + scale_y_continuous(limits = c(0,50)) +\n",
    "  #scale_x_continuous(limits = c(1820, 1920), breaks = (c(1800,1820,1840,1860,1880,1900, 1920)))+\n",
    "  #guides(fill=guide_legend(nrow=3))\n",
    "    )\n",
    "p\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p.save(filename = \"plot3.pdf\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)\n",
    "p.save(filename = \"plot3.png\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from palettable.colorbrewer.qualitative import *\n",
    "df_ga_auswahl = df_ga[['roman_rel_title', 'erzählung_gesamt_rel_title', 'geschichte_rel_title', 'novelle_rel_title', 'lied_rel_title', \\\n",
    "                                    'gedicht_rel_title', 'historie/historia_rel_title', 'carmen heroicum/epos_rel_title', \"brief_rel_title\", \\\n",
    "                       \"bericht_rel_title\", \"drama/schauspiel_rel_title\"]] \n",
    "df_ga_auswahl_roll = df_ga_auswahl.rolling(window = 1, min_periods=1).mean()\n",
    "df_ga_auswahl_roll_wide = pd.melt(df_ga_auswahl_roll.reset_index(),id_vars=['date'],var_name='genre_name', value_name='values')\n",
    "pd.set_option('display.max_rows', 500)\n",
    "#print(df_ga_auswahl_roll_wide)\n",
    "p = (ggplot(df_ga_auswahl_roll_wide.reset_index(), aes(x=\"date\", y= \"values\", fill=\"genre_name\")) +\n",
    "  geom_area()\n",
    "      + labs(title='Presence of major genre labels, 1800-2020', x='year', y='proportion of literary production (%)')+\n",
    "  xlab(\"year\")+\n",
    "  ylab(\"proportion of literary production (%)\") \n",
    "      + xlim(1800, 2020)\n",
    " #    + scale_fill_brewer(type = \"div\", palette = \"Spectral\", direction = 1)\n",
    " # + scale_y_continuous(limits = c(0,50)) +\n",
    "  #scale_x_continuous(limits = c(1820, 1920), breaks = (c(1800,1820,1840,1860,1880,1900, 1920)))+\n",
    "  #guides(fill=guide_legend(nrow=3))\n",
    "    )\n",
    "p\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p.save(filename = \"plot4.pdf\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)\n",
    "p.save(filename = \"plot4.png\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate means of major genre labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ga_auswahl_roll_1500_1800 = df_ga_auswahl_roll.iloc[0:301, :]\n",
    "df_ga_auswahl_roll_1500_1800.mean(axis=0).round(1).sort_values(ascending=False).head(25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ga_auswahl_roll_1800_2020 = df_ga_auswahl_roll.iloc[301:521, :]\n",
    "df_ga_auswahl_roll_1800_2020.mean(axis=0).round(1).sort_values(ascending=False).head(25)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Analysing the relation between genre label and genre production (using data from Mühlberger, Habitzel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import chardet    \n",
    "rawdata = open('C:\\\\Users\\Benjamin\\Dropbox\\DH\\Eigene\\GBV\\Daten\\Historischer_Roman_Konjunkturen.csv', 'rb').read()\n",
    "result = chardet.detect(rawdata)\n",
    "charenc = result['encoding']\n",
    "print(charenc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "HR = pd.read_csv('C:\\\\Users\\Benjamin\\Dropbox\\DH\\Eigene\\GBV\\Daten\\Historischer_Roman_Konjunkturen.csv', encoding='ISO-8859-1', delimiter=';', lineterminator='\\n', low_memory=False)\n",
    "HR.index = HR.index + 1785\n",
    "#HR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Kopieren der Daten aus df_fp (full phrases)\n",
    "new = df_fp[[\"historischer roman\", \"historischer roman_rel_title\", \"counts_title\"]]\n",
    "new.columns =[\"counts_genre_term\", \"genre_term_rel_titles\", \"counts_title\"] # umbenennen der Spalten\n",
    "\n",
    "HR = pd.concat([new, HR], axis=1)\n",
    "pd.set_option('display.max_rows', 1000)\n",
    "#HR[\"historischer Roman\"].replace(NA, 0)\n",
    "HR[\"HR_rel_title\"] = (HR[\"Anzahl_HR\"].divide(HR[\"counts_title\"], fill_value = 0))*100\n",
    "HR.index.name = \"date\"\n",
    "df_HR = HR[[\"genre_term_rel_titles\", \"HR_rel_title\"]]\n",
    "df_HR.rename(columns={'genre_term_rel_titles': 'genre label', 'HR_rel_title': 'genre production'}, inplace=True)\n",
    "df_HR\n",
    "# from wide to long format\n",
    "df_HR2 = pd.melt(df_HR.reset_index(),id_vars=['date'],var_name='genre name', value_name='values')\n",
    "df_HR2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = (ggplot(df_HR2.reset_index(), aes(x=\"date\", y= \"values\", color=\"genre name\"))\n",
    "+ geom_line() + labs(title='Genre production and genre label: the historical novel', x='year', y='proportion of literary production (%)')\n",
    " + xlim(1775, 1940)\n",
    "     + scale_color_brewer(type = \"qual\", palette = \"Set1\", direction = 1)\n",
    "    +geom_vline(xintercept = 1841, linetype=\"dotted\")\n",
    "    +geom_vline(xintercept = 1865, linetype=\"dotted\")\n",
    "    +geom_vline(xintercept = 1913, linetype=\"dotted\")\n",
    "    +geom_vline(xintercept = 1861, linetype=\"dotted\")\n",
    "    +geom_vline(xintercept = 1851, linetype=\"dotted\")\n",
    "    +geom_vline(xintercept = 1889, linetype=\"dotted\"))\n",
    "p\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p.save(filename = \"plot5.pdf\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)\n",
    "p.save(filename = \"plot5.png\", width=None, height=None, dpi=1200, device = \"pdf\", path = path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculation the correlation between label and production"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "HR_for_corr = HR[285:441]\n",
    "HR_for_corr = HR_for_corr[[\"HR_rel_title\", \"genre_term_rel_titles\"]]\n",
    "HR_for_corr\n",
    "# Spearmans Rho\n",
    "HR_for_corr.corr(method='spearman')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## K-Means Clustering using DTW"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Formatierung der Daten\n",
    "import numpy\n",
    "import math\n",
    "from tslearn.utils import to_time_series_dataset\n",
    "\n",
    "list_of_time_series = []\n",
    "\n",
    "df_fgm_fp2 = df_fgm_fp\n",
    "df_fgm_fp2 = df_fgm_fp2[persistant_genres]\n",
    "try:\n",
    "    df_fgm_fp2 = df_fgm_fp2.drop(2020) # 2020 entfernen\n",
    "except:\n",
    "    pass\n",
    "\n",
    "\n",
    "# remove leading zeros in data for every genre\n",
    "df_fgm_fp2_ohne_nullen = df_fgm_fp2.copy()\n",
    "i = 0\n",
    "for i in df_fgm_fp2.columns:\n",
    "    s = df_fgm_fp2[i]\n",
    "    value = s.iloc[0]\n",
    "    \n",
    "    while value == 0 or math.isnan(value):\n",
    "        s = s.loc[s.first_valid_index():]\n",
    "        s = numpy.trim_zeros(s, \"f\")\n",
    "        value = s.iloc[0]   \n",
    "    df_fgm_fp2_ohne_nullen[i] = pd.Series(s)\n",
    "    list_of_time_series.append(s)\n",
    "\n",
    "pd.set_option('display.max_columns', 20)\n",
    "pd.set_option('display.max_columns', 20)\n",
    "X_train = to_time_series_dataset(list_of_time_series)\n",
    "\n",
    "# Standard Variante\n",
    "#X_train = to_time_series_dataset(df_fgm_fp2.T)\n",
    "\n",
    "#print(X_train.shape)\n",
    "#print(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from tslearn.clustering import TimeSeriesKMeans\n",
    "from tslearn.datasets import CachedDatasets\n",
    "from tslearn.preprocessing import TimeSeriesScalerMeanVariance, \\\n",
    "    TimeSeriesResampler\n",
    "\n",
    "seed = 0\n",
    "numpy.random.seed(seed)\n",
    "#numpy.random.shuffle(X_train)\n",
    "\n",
    "# Keep only 50 time series\n",
    "counter = 0\n",
    "\n",
    "X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train)\n",
    "print(X_train.shape)\n",
    "\n",
    "# Make time series shorter\n",
    "X_train = TimeSeriesResampler(sz=50).fit_transform(X_train)\n",
    "\n",
    "#print(\"resamplet: \", X_train)\n",
    "sz = X_train.shape[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.pyplot import figure\n",
    "dba_inertia = {}\n",
    "sdtw_inertia = {}\n",
    "dba_inertias = []\n",
    "sdtw_inertias = []\n",
    "\n",
    "# set number of clusters\n",
    "K = range(2,11)\n",
    "\n",
    "for nr_clusters in K:\n",
    "    figure(num=None, figsize=(16, 12), dpi=1200, facecolor='w', edgecolor='k', tight_layout=True)\n",
    "    print(\"Zahl der Cluster:\", nr_clusters)\n",
    "    \n",
    "    # Soft-DTW-k-means\n",
    "    print(\"Soft-DTW k-means\")\n",
    "    sdtw_km = TimeSeriesKMeans(n_clusters=nr_clusters,\n",
    "                               metric=\"softdtw\",\n",
    "                               metric_params={\"gamma\": .01},\n",
    "                               verbose=False,\n",
    "                               random_state=seed)\n",
    "    y_pred = sdtw_km.fit_predict(X_train)\n",
    "\n",
    "    for yi in range(nr_clusters):\n",
    "        plt.subplot(3, nr_clusters, 1 + 2*nr_clusters + yi)\n",
    "        for xx in X_train[y_pred == yi]:\n",
    "            plt.plot(xx.ravel(), \"k-\", alpha=.2)\n",
    "        plt.plot(sdtw_km.cluster_centers_[yi].ravel(), \"r-\")\n",
    "        plt.xlim(0, sz)\n",
    "        plt.ylim(-4, 4)\n",
    "        plt.text(0.55, 0.15,'Cluster %d' % (yi + 1),\n",
    "                 transform=plt.gca().transAxes)\n",
    "        if yi == 1:\n",
    "            plt.title(\"Soft-DTW $k$-means\")\n",
    "            \n",
    "                                  \n",
    "    plt.tight_layout()\n",
    "    filename = \"dtw\" + str(nr_clusters) + \".png\"\n",
    "    path = r'C:\\Users\\Benjamin\\Dropbox\\Wissenschaft\\Untertitel_und_literarischer_Wandel\\Plots\\Temp'\n",
    "    #plt.savefig(path + \"\\\\\" + filename, dpi=1200)\n",
    "    plt.show()\n",
    "    #fig = plt.figure()\n",
    "    #fig.tight_layout()\n",
    "    #plt.close(fig)\n",
    "    \n",
    "    # save inertia (within-cluster sum-of-squares)\n",
    "    sdtw_inertias.append(sdtw_km.inertia_)\n",
    "    sdtw_inertia[nr_clusters] = sdtw_km.inertia_\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Soft DTW inertia: \\n\")\n",
    "for key,val in sdtw_inertia.items(): \n",
    "    print(str(key)+' : '+str(val)) \n",
    "\n",
    "plt.plot(K, sdtw_inertias, 'bx-') \n",
    "plt.xlabel('Values of K') \n",
    "plt.ylabel('Inertia') \n",
    "plt.title('The Elbow Method using Inertia, softdtw') \n",
    "#plt.savefig(path + \"\\\\\" + \"soft-DTW ellbow\", dpi=1200)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# centroid of cluster 0 (=1)\n",
    "#print(sdtw_km.cluster_centers_[0])\n",
    "print(sdtw_km.cluster_centers_.shape)\n",
    "print(\"Anzahl Labels: \", len(sdtw_km.labels_))\n",
    "print(\"iterationen\", sdtw_km.n_iter_ )\n",
    "print(\"Labels: \", sdtw_km.labels_)\n",
    "print(sdtw_km.inertia_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### get cluster number for every genre"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pd.set_option('display.max_columns', 500)\n",
    "pd.set_option('display.max_rows', 500)\n",
    "\n",
    "df_fgm_fp_clustered = df_fgm_fp2.copy()\n",
    "df_fgm_fp_clustered.loc[len(df_fgm_fp_clustered)] = sdtw_km.labels_ + 1 ### ACHTUNG: Zählung der Cluster jetzt wie in Diagramm\n",
    "\n",
    "df_fgm_fp_clustered = df_fgm_fp_clustered.reindex(sorted(df_fgm_fp_clustered.columns), axis=1)\n",
    "len(df_fgm_fp_clustered)\n",
    "cluster = df_fgm_fp_clustered.iloc[len(df_fgm_fp_clustered)-1]\n",
    "i = 1\n",
    "for i in range(i, nr_clusters+1):\n",
    "    print(\"\\nCluster\", i,  cluster[cluster == i])\n",
    "    "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
